library(tm)
library(tidytext)
library(tidyverse)
library(DT)

Step 1 - Load the data to be cleaned and processed

## Location of the cleaned HappyDB moments CSV on GitHub.
urldata <- "https://raw.githubusercontent.com/rit-public/HappyDB/master/happydb/data/cleaned_hm.csv"
## Read the CSV at that URL into hm_data and preview its first rows.
hm_data <- urldata %>% read_csv()
hm_data %>% head()

Step 2 - Clean the text

## Build a corpus from the cleaned_hm column of hm_data.
corpus <- hm_data$cleaned_hm %>%
  VectorSource() %>%
  VCorpus()
## Preview the corpus; a recorded run printed:
##   <<VCorpus>>
##   Metadata:  corpus specific: 0, document level (indexed): 0
##   Content:  documents: 6
head(corpus)
## Apply the cleaning transformations one after another, in this order:
##   1. collapse extra whitespace
##   2. convert to lower case
##   3. remove punctuation
##   4. remove numbers
##   5. remove words from a word list
## NOTE(review): the word list given to removeWords is character(0), i.e.
## empty -- confirm whether a real list of words was intended here.
corpus <- corpus %>%
  tm_map(stripWhitespace) %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers) %>%
  tm_map(removeWords, character(0))

Step 3 - Stemming words and converting tm object to tidy object

Stemming reduces a word to its word stem. We stem the words here and then convert the “tm” object to a “tidy” object for much faster processing.

## Stem every document of the cleaned corpus.
stem <- corpus %>% tm_map(stemDocument)
## Convert the stemmed "tm" object into a "tidy" object.
tidy_corpus <- stem %>% tidy()
## Only the text column is needed from here on.
stem_corpus <- tidy_corpus %>% select(text)

Step 4 - Creating tidy format of the dictionary to be used for completing stems

We also need a dictionary to look up the words corresponding to the stems.

## Tidy the cleaned (un-stemmed) corpus and keep only its text column.
dict0 <- corpus %>% tidy()
text_dict0 <- dict0 %>% select(text)
## Split each text into individual words and collect them in a column
## named `dictionary`.
dict <- text_dict0 %>% unnest_tokens(dictionary, text)

Step 5 - Add custom stop words in the context of our data

## Load the "stop_words" data set.
data("stop_words")
## Extra stop words that we consider unimportant for this data.
word <- c(
  "happy", "ago", "yesterday", "tomorrow", "lot", "today", "months",
  "month", "happier", "happiest", "last", "week", "day", "past"
)

## Append the extra words to stop_words, labelling them with the
## lexicon "added".
stop_words <- bind_rows(
  stop_words,
  tibble(word = word, lexicon = "added")
)

Step 6 - Combining stems and dictionary into the same tibble

Here we combine the stems and the dictionary into the same “tidy” object.

## Number each text by its row position, then split it into one stem per row.
stem_tokens <- unnest_tokens(
  mutate(stem_corpus, id = row_number()),
  stems,
  text
)
## Put the dictionary words next to the stems. bind_cols() pairs the two
## tables by row position, so this relies on both having their tokens in
## the same order.
stems_with_dict <- bind_cols(stem_tokens, dict)
## Drop every row whose dictionary word appears in stop_words.
completed_corpus1 <- anti_join(
  stems_with_dict,
  stop_words,
  by = c("dictionary" = "word")
)

Step 7 - Stem completion

Lastly, we complete the stems by picking the corresponding word with the highest frequency.

## Build a stem -> word lookup table: for each stem, keep the dictionary
## word that occurs most often. which.max() returns the first maximum, so
## ties go to the first word in the order produced by count().
stem_lookup <- completed_corpus1 %>%
  count(stems, dictionary) %>% ## occurrences of each (stem, word) pair
  group_by(stems) %>%
  summarise(word = dictionary[which.max(n)]) %>% ## one row per stem
  ungroup()

## Attach the completed word to every token of completed_corpus1.
## left_join() is used with completed_corpus1 on the left so that the
## result keeps the rows of completed_corpus1 in their existing order; the
## later step that pastes words back together per id depends on that
## order. The join key is given explicitly instead of being guessed from
## the shared column names.
completed_corpus2 <- completed_corpus1 %>%
  left_join(stem_lookup, by = "stems") %>%
  select(word, id, dictionary) ## drop the stem, keep word/id/dictionary

Step 8 - Pasting stem completed individual words into their respective happy moments

We want our processed words to resemble the structure of the original happy moments. So we paste the words together to form happy moments.

## For each id, paste its words together into one space-separated text.
words_by_id <- group_by(completed_corpus2, id)
completed_corpus3 <- ungroup(
  summarise(words_by_id, text = str_c(word, collapse = " "))
)

Step 9 - Keeping track of the happy moments with their own ID

## Number the original rows the same way the processed text was numbered
## (by row position), then attach the processed text to each row.
## inner_join() keeps only the ids that are present in completed_corpus3.
## The key is stated explicitly so the join never silently picks up other
## columns that happen to share a name.
hm_data <- hm_data %>%
  mutate(id = row_number()) %>%
  inner_join(completed_corpus3, by = "id")

## Show the final data as an interactive table.
datatable(hm_data)

Exporting the processed text data into a CSV file

## Write the processed data to a CSV file.
## NOTE(review): the destination is an absolute path on one particular
## machine -- confirm it exists wherever this script is run.
hm_data %>%
  write_csv("/Users/apple/Documents/GitHub/Spring2019-Proj1-yushen0922/output/processed_moments.csv")

The final processed data is ready to be used for any kind of analysis.